Starting form the filtered table from ‘HMP_coverage.Rmd’. Run a series of analysis to look at relationships between body site and subjects.

library(reshape2)
#library(igraph)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#library(biomod2)
library(e1071)
library(RColorBrewer)
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last
## 
## The following object is masked from 'package:stats':
## 
##     nobs
## 
## The following object is masked from 'package:utils':
## 
##     object.size
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.3-0
library(assertthat)
source('./staph_metagenome_tools.R')

Read in data file created in earlier pipeline

dat4 <- read.table("./Data/combined")

Create data files

#list of all subjects with more than one sample
multiSubjects <- count(dat4,Subject.Id) %>% filter(n > 1) %>% select(Subject.Id ) 
dat5 <- make_subtype_matrix(dat4)
#create Hamming dist matrices with and without cutof  min value of 0.2
dat6 <- make_subtype_matrix(dat4) %>% bintr(0.2) %>% hamming.distance %>% data.frame 
dat8 <- make_subtype_matrix(dat4) %>% hamming.distance %>% data.frame 

Permanova

test for significant associations of subtype with with bodysite and subject. us e Hamming dist. matrix. Two levels, one with a beta cutoff for all samples > 0.2 and one without

set.seed(344098)
run_bs_subj_adonis(dat6,dat4$Body.site,dat4$Subject.Id)
## 
## Call:
## adonis(formula = df ~ bs_vec) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model     R2 Pr(>F)    
## bs_vec     13    1.1036 0.084895  3.3775 0.1187  0.001 ***
## Residuals 326    8.1943 0.025136         0.8813           
## Total     339    9.2979                  1.0000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Homogeneity of multivariate dispersions
## 
## Call: betadisper(d = dist(df), group = bs_vec)
## 
## No. of Positive Eigenvalues: 30
## No. of Negative Eigenvalues: 0
## 
## Average distance to median:
##               anterior nares attached keratinized gingiva 
##                        15.51                        10.95 
##                buccal mucosa                  hard palate 
##                        14.54                         0.00 
##   left retroauricular crease              palatine tonsil 
##                        15.97                        13.02 
##            posterior fornnix      right antecubital fossa 
##                        10.02                         0.00 
##  right retroauricular crease                       saliva 
##                        18.97                         0.00 
##                        stool           subgingival_plaque 
##                        10.62                         0.00 
##         supragingival plaque                tongue dorsum 
##                        14.37                        17.01 
## 
## Eigenvalues for PCoA axes:
##     PCoA1     PCoA2     PCoA3     PCoA4     PCoA5     PCoA6     PCoA7 
## 41603.032 37193.311  8469.370  3596.008  2390.837  1979.686  1506.512 
##     PCoA8 
##  1166.622 
##            Df    Sum Sq   Mean Sq        F N.Perm Pr(>F)
## Groups     13  2197.755 169.05808 5.134462    999  0.001
## Residuals 326 10733.925  32.92615       NA     NA     NA
## 
## Call:
## adonis(formula = df ~ subj_vec) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)   
## subj_vec    1    0.1238 0.123823   4.562 0.01332  0.008 **
## Residuals 338    9.1741 0.027142         0.98668          
## Total     339    9.2979                  1.00000          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Homogeneity of multivariate dispersions
## 
## Call: betadisper(d = dist(df), group = subj_vec)
## 
## No. of Positive Eigenvalues: 30
## No. of Negative Eigenvalues: 0
## 
## Average distance to median:
##         1         2         3         4         5         6         7 
## 9.220e+00 1.217e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.039e+01 
##         8         9        10        11        12        13        14 
## 2.598e+00 7.036e+00 2.439e+01 1.187e+01 1.574e+01 1.362e+01 0.000e+00 
##        15        16        17        18        19        20        21 
## 1.765e+01 0.000e+00 0.000e+00 1.102e+01 2.291e+01 0.000e+00 9.220e+00 
##        22        23        24        25        26        27        28 
## 1.196e+01 0.000e+00 0.000e+00 1.511e+01 7.036e+00 1.183e+01 1.320e+01 
##        29        30        31        32        33        34        35 
## 1.111e+01 1.005e+01 1.044e+01 1.598e+01 1.533e+01 1.187e+01 1.312e+01 
##        36        37        38        39        40        41        42 
## 0.000e+00 1.715e+01 4.899e+00 1.604e+01 1.675e+01 1.540e+01 1.114e+01 
##        43        44        45        46        47        48        49 
## 1.143e+01 6.896e+00 0.000e+00 0.000e+00 0.000e+00 9.381e+00 0.000e+00 
##        50        51        52        53        54        55        56 
## 0.000e+00 0.000e+00 8.130e-14 0.000e+00 0.000e+00 1.334e+01 1.922e+01 
##        57        58        59        60        61        62        63 
## 1.217e+01 0.000e+00 0.000e+00 1.535e+01 0.000e+00 0.000e+00 1.285e+01 
##        64        65        66        67        68        69        70 
## 0.000e+00 1.407e+01 1.142e+01 4.372e+00 1.409e+01 1.446e+01 1.068e+01 
##        71        72        73        74        75        76        77 
## 8.138e+00 1.722e+01 1.702e+01 9.440e+00 6.146e+00 7.868e+00 1.221e+01 
##        78        79        80        81        82        83        84 
## 8.599e+00 2.082e-14 1.150e+01 1.572e+01 1.897e-14 2.273e-14 2.398e+00 
##        85        86        87        88        89        90        91 
## 0.000e+00 1.176e+01 1.407e+01 0.000e+00 0.000e+00 1.764e+01 1.149e+01 
##        92        93        94        95        96        97        98 
## 1.170e+01 1.269e+01 1.926e+01 1.633e+01 0.000e+00 1.376e+01 0.000e+00 
##        99       100       101       102       103       104       105 
## 0.000e+00 1.187e+01 0.000e+00 1.072e+01 1.130e+01 0.000e+00 0.000e+00 
##       106       107       108       109       110 
## 1.340e+01 1.025e+01 1.407e+01 0.000e+00 9.220e+00 
## 
## Eigenvalues for PCoA axes:
##     PCoA1     PCoA2     PCoA3     PCoA4     PCoA5     PCoA6     PCoA7 
## 41603.032 37193.311  8469.370  3596.008  2390.837  1979.686  1506.512 
##     PCoA8 
##  1166.622 
##            Df   Sum Sq   Mean Sq        F N.Perm Pr(>F)
## Groups    109 11149.32 102.28732 1.188271    999  0.163
## Residuals 230 19798.59  86.08081       NA     NA     NA
## 
## Call:
## adonis(formula = df ~ bs_vec + subj_vec) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)    
## bs_vec     13    1.1036 0.084895  3.4337 0.11870  0.001 ***
## subj_vec    1    0.1589 0.158933  6.4283 0.01709  0.003 ** 
## Residuals 325    8.0353 0.024724         0.86421           
## Total     339    9.2979                  1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
run_bs_subj_adonis(dat8,dat4$Body.site,dat4$Subject.Id)
## 
## Call:
## adonis(formula = df ~ bs_vec) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)    
## bs_vec     13    2.7003 0.207713  12.517 0.33295  0.001 ***
## Residuals 326    5.4098 0.016594         0.66705           
## Total     339    8.1100                  1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Homogeneity of multivariate dispersions
## 
## Call: betadisper(d = dist(df), group = bs_vec)
## 
## No. of Positive Eigenvalues: 282
## No. of Negative Eigenvalues: 0
## 
## Average distance to median:
##               anterior nares attached keratinized gingiva 
##                        56.14                        13.70 
##                buccal mucosa                  hard palate 
##                        16.22                         0.00 
##   left retroauricular crease              palatine tonsil 
##                        37.07                        19.66 
##            posterior fornnix      right antecubital fossa 
##                        11.57                         0.00 
##  right retroauricular crease                       saliva 
##                        31.96                         0.00 
##                        stool           subgingival_plaque 
##                        20.52                         0.00 
##         supragingival plaque                tongue dorsum 
##                        15.89                        25.00 
## 
## Eigenvalues for PCoA axes:
##      PCoA1      PCoA2      PCoA3      PCoA4      PCoA5      PCoA6 
## 937128.241  22487.052  15107.447   7151.909   4346.681   3264.424 
##      PCoA7      PCoA8 
##   2801.537   2365.001 
##            Df    Sum Sq  Mean Sq        F N.Perm Pr(>F)
## Groups     13  85529.29 6579.176 4.764105    999  0.028
## Residuals 326 450202.43 1380.989       NA     NA     NA
## 
## Call:
## adonis(formula = df ~ subj_vec) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## subj_vec    1    0.4003 0.40029  17.549 0.04936  0.001 ***
## Residuals 338    7.7097 0.02281         0.95064           
## Total     339    8.1100                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Homogeneity of multivariate dispersions
## 
## Call: betadisper(d = dist(df), group = subj_vec)
## 
## No. of Positive Eigenvalues: 282
## No. of Negative Eigenvalues: 0
## 
## Average distance to median:
##         1         2         3         4         5         6         7 
## 1.583e+01 2.864e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 6.129e+01 
##         8         9        10        11        12        13        14 
## 8.400e+01 2.371e+01 5.065e+01 2.262e+01 5.596e+01 6.132e+01 0.000e+00 
##        15        16        17        18        19        20        21 
## 2.793e+01 0.000e+00 0.000e+00 4.584e+01 2.476e+01 0.000e+00 1.154e+01 
##        22        23        24        25        26        27        28 
## 1.040e+01 0.000e+00 0.000e+00 4.128e+01 1.480e+01 3.242e+01 2.898e+01 
##        29        30        31        32        33        34        35 
## 1.987e+01 2.840e+01 1.840e+02 3.448e+01 1.796e+01 9.816e+01 4.058e+01 
##        36        37        38        39        40        41        42 
## 0.000e+00 5.659e+01 3.433e+01 4.281e+01 2.522e+01 3.329e+01 5.673e+01 
##        43        44        45        46        47        48        49 
## 1.349e+02 7.548e+01 0.000e+00 0.000e+00 0.000e+00 6.110e+00 0.000e+00 
##        50        51        52        53        54        55        56 
## 0.000e+00 0.000e+00 4.636e+01 0.000e+00 0.000e+00 2.038e+01 3.132e+01 
##        57        58        59        60        61        62        63 
## 3.035e+01 0.000e+00 0.000e+00 2.086e+01 0.000e+00 0.000e+00 3.096e+01 
##        64        65        66        67        68        69        70 
## 0.000e+00 1.790e+01 1.863e+01 4.031e+01 1.315e+01 1.491e+01 1.939e+01 
##        71        72        73        74        75        76        77 
## 1.895e+01 2.015e+01 2.220e+01 1.800e+01 1.559e+01 2.402e+01 1.653e+01 
##        78        79        80        81        82        83        84 
## 8.463e+00 1.740e+01 1.535e+01 2.122e+01 2.757e-12 4.792e-12 2.291e+01 
##        85        86        87        88        89        90        91 
## 0.000e+00 1.888e+01 9.165e+00 0.000e+00 0.000e+00 1.204e+01 1.028e+01 
##        92        93        94        95        96        97        98 
## 3.843e+01 1.105e+01 2.403e+01 1.682e+01 0.000e+00 6.451e+01 0.000e+00 
##        99       100       101       102       103       104       105 
## 0.000e+00 1.427e+01 0.000e+00 2.148e+01 1.262e+01 0.000e+00 0.000e+00 
##       106       107       108       109       110 
## 2.277e+01 1.124e+01 9.165e+00 0.000e+00 1.563e+01 
## 
## Eigenvalues for PCoA axes:
##      PCoA1      PCoA2      PCoA3      PCoA4      PCoA5      PCoA6 
## 937128.241  22487.052  15107.447   7151.909   4346.681   3264.424 
##      PCoA7      PCoA8 
##   2801.537   2365.001 
##            Df   Sum Sq  Mean Sq        F N.Perm Pr(>F)
## Groups    109 205874.1 1888.754 1.247548    999  0.216
## Residuals 230 348213.8 1513.973       NA     NA     NA
## 
## Call:
## adonis(formula = df ~ bs_vec + subj_vec) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)    
## bs_vec     13    2.7003 0.207713  12.911 0.33295  0.001 ***
## subj_vec    1    0.1813 0.181274  11.268 0.02235  0.001 ***
## Residuals 325    5.2285 0.016088         0.64469           
## Total     339    8.1100                  1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Permutation tests

#test ffor whether intra-subject distance greater than intersubject
subject_perm(dat4,multiSubjects,dat6)
## Score for intraperson hits =  1499 
## [1] "Quartlies for random distribution"
##   0%  25%  50%  75% 100% 
## 1555 1649 1668 1687 1773 
## Empirical p value =  0
#now look at the same test between body sites
bs <- levels(dat4$Body.site)
by_factor_perm(bs,dat4,dat6)
## [1] "anterior nares"
## [1] "Number of samples " "79"                
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
## 5848 6804 7028 7254 8150 
## [1] 6980
## Empirical p value [1] 0.4431
## 
## [1] "attached keratinized gingiva"
## [1] "Number of samples " "4"                 
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
##    0   12   13   16   30 
## [1] 8
## Empirical p value [1] 0.056
## 
## [1] "buccal mucosa"
## [1] "Number of samples " "59"                
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
## 3076 3748 3894 4046 4870 
## [1] 3192
## Empirical p value [1] 3e-04
## 
## Zero samples in  hard palate[1] "left retroauricular crease"
## [1] "Number of samples " "23"                
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
##  376  538  574  614  782 
## [1] 636
## Empirical p value [1] 0.8603
## 
## [1] "palatine tonsil"
## [1] "Number of samples " "6"                 
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
##    5   29   34   39   63 
## [1] 36
## Empirical p value [1] 0.6266
## 
## [1] "posterior fornnix"
## [1] "Number of samples " "11"                
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
##   48  114  126  138  190 
## [1] 76
## Empirical p value [1] 0.0051
## 
## Zero samples in  right antecubital fossa[1] "right retroauricular crease"
## [1] "Number of samples " "28"                
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
##  549  810  860  912 1127 
## [1] 1081
## Empirical p value [1] 0.9978
## 
## Zero samples in  saliva[1] "stool"
## [1] "Number of samples " "7"                 
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
##    6   42   48   54   82 
## [1] 30
## Empirical p value [1] 0.0334
## 
## Zero samples in  subgingival_plaque[1] "supragingival plaque"
## [1] "Number of samples " "37"                
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
## 1060 1442 1518 1594 1924 
## [1] 1164
## Empirical p value [1] 8e-04
## 
## [1] "tongue dorsum"
## [1] "Number of samples " "82"                
## [1] "Distribution of random hits"
##   0%  25%  50%  75% 100% 
## 6105 7338 7561 7797 8843 
## [1] 7815
## Empirical p value [1] 0.7673

Plots of subtype distribution

presence_mat <- as.data.frame(bintr(dat5,0.2))
top_score_mat <- as.data.frame(bintr(dat5,0.5))
# png("~/Dropbox/ARTICLES_BY_TDR/2015-staph-metagenome/HMP_barchart.png",width=640, height =640, res = 75)
# dev.off()
genotypes_plot(presence_mat,"All samples, subtypes present > 0.2")

genotypes_plot(top_score_mat,"All samples, subtypes present > 0.5")

for (i in bs) {
  bss_rows <- which(dat4$Body.site == i)
  if(length(bss_rows) > 0) {
    bs_df <- slice(presence_mat,bss_rows)
    genotypes_plot(bs_df,paste("Present: ", i))
  }
}

for (i in bs) {
  bss_rows <- which(dat4$Body.site == i)
  if(length(bss_rows) > 0) {
    bs_df <- slice(top_score_mat,bss_rows)
    genotypes_plot(bs_df,paste("Top score: ", i))
  }
}

### PCA

par(mfrow=c(2,2))
pcobj <- prcomp(dat6)
tr_gray <- rgb(0.5,.5,.5,.15)

for (i in bs) {
  prcols <- rep(tr_gray,nrow(dat6))
  prcols[which(dat4$Body.site == i)] <- "red"
  plot(pcobj$x,col = prcols, pch = 16, main = i)
}

for (i in multiSubjects$Subject.Id) {
  sub_rows = which(dat4$Subject.Id == as.character(i))
  if (length(sub_rows) > 3){
    prcols <- rep(tr_gray,nrow(dat6))
    prcols[sub_rows] <- "blue"
    plot(pcobj$x,col = prcols, pch = 16, main = c("Subject",i))
  }
}

### Session Info

sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] assertthat_0.1     vegan_2.3-0        lattice_0.20-33   
## [4] permute_0.8-4      gdata_2.17.0       RColorBrewer_1.1-2
## [7] e1071_1.6-7        dplyr_0.4.2        reshape2_1.4.1    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.0     cluster_2.0.3   knitr_1.10.5    magrittr_1.5   
##  [5] MASS_7.3-43     R6_2.1.0        stringr_1.0.0   plyr_1.8.3     
##  [9] tools_3.2.1     parallel_3.2.1  grid_3.2.1      nlme_3.1-121   
## [13] mgcv_1.8-7      DBI_0.3.1       htmltools_0.2.6 class_7.3-13   
## [17] gtools_3.5.0    lazyeval_0.1.10 yaml_2.1.13     digest_0.6.8   
## [21] Matrix_1.2-2    evaluate_0.7    rmarkdown_0.7   stringi_0.5-5